import os
import urllib.request
import zipfile
import pandas as pd
import numpy as np

def download_sample_data(n_samples=1000,
                         output_path='data/raw/crime-data-los-angeles.csv',
                         seed=42):
    """
    Create a synthetic crime dataset and save it as a CSV file for testing.

    Despite the name, nothing is downloaded: every column is generated
    locally. The columns written are 'DATE OCC', 'TIME OCC', 'AREA NAME',
    'Crm Cd Desc', 'LAT', 'LON', 'Premis Desc' and 'Weapon Desc'.

    Args:
        n_samples: Number of rows to generate.
        output_path: Destination CSV path. Missing parent directories are
            created.
        seed: Value passed to ``np.random.seed``. Note that this reseeds
            NumPy's global random state as a side effect.

    Returns:
        None. The dataset is written to ``output_path``.
    """
    np.random.seed(seed)  # fixed seed so repeated runs yield identical data

    areas = ['Central', 'Rampart', 'Southwest', 'Hollywood', 'Harbor']
    crimes = ['BURGLARY', 'THEFT', 'ASSAULT', 'ROBBERY', 'VANDALISM']
    premises = ['STREET', 'RESIDENCE', 'VEHICLE', 'PARKING LOT', 'STORE']
    weapons = ['NONE', 'FIREARM', 'KNIFE', 'PHYSICAL FORCE', 'OTHER']

    # One hour-of-day per row, cycling 0..23, encoded as 24-hour "HHMM".
    # Zero-padding the bare hour to four digits would turn hour 13 into
    # "0013" (00:13) instead of "1300" (13:00).
    time_occ = [str(i % 24).zfill(2) + '00' for i in range(n_samples)]

    # The random draws are made in a fixed order (area, crime, lat, lon,
    # premise, weapon) so a given seed always maps to the same dataset.
    data = {
        'DATE OCC': pd.date_range(start='2020-01-01', periods=n_samples),
        'TIME OCC': time_occ,
        'AREA NAME': np.random.choice(areas, n_samples),
        'Crm Cd Desc': np.random.choice(crimes, n_samples),
        # Coordinates are scattered around a fixed centre point with a
        # standard deviation of 0.1 degrees.
        'LAT': 34.0522 + np.random.normal(0, 0.1, n_samples),
        'LON': -118.2437 + np.random.normal(0, 0.1, n_samples),
        'Premis Desc': np.random.choice(premises, n_samples),
        'Weapon Desc': np.random.choice(weapons, n_samples)
    }

    df = pd.DataFrame(data)

    # Create the target directory; a bare filename has no parent to create
    # (os.makedirs('') would raise).
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print("Sample data created successfully!")

def _run_as_script():
    """Announce the run, then generate the sample dataset."""
    print("Creating sample data for testing...")
    download_sample_data()


# Only generate data when executed directly, not when imported.
if __name__ == "__main__":
    _run_as_script()